Findings

library(tidyverse)
## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(tidygraph)
## Warning: package 'tidygraph' was built under R version 3.5.2
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.2
# library(smglr)
library(broom)
## Warning: package 'broom' was built under R version 3.5.2
library(ggforce)
## Warning: package 'ggforce' was built under R version 3.5.2
library(tictoc)

## Input/output locations (relative to the script directory)
data_folder <- '../data/'
plots_folder <- '../plots/'

## Load data ----
load(str_c(data_folder, '01_parsed.Rdata'))

There are 203 PhD programs producing graduate students in the data

## How many PhD programs have at least one recorded placement?
nrow(filter(univ_df, total_placements > 0))
## [1] 203

Finding: 37 PhD programs (37/203 = 18%) produce about 50% of permanent placements

## Cumulative share of permanent placements by program rank
ggplot(univ_df, 
       aes(x = perm_placement_rank, y = frac_cum_perm_placements)) + 
    geom_step() +
    scale_x_continuous(name = 'PhD Programs', 
                       labels = scales::percent_format()) +
    scale_y_continuous(name = 'Permanent Placements', 
                       labels = scales::percent_format())
## Warning: Removed 23 rows containing missing values (geom_path).

## Table of the programs that jointly account for ~50% of permanent placements
univ_df %>%
    filter(frac_cum_perm_placements <= .5) %>%
    arrange(perm_placement_rank) %>%
    ## Re-rank 1..n after filtering
    mutate(perm_placement_rank = row_number()) %>%
    select(perm_placement_rank, univ_name, perm_placements, 
           frac_cum_perm_placements) %>%
    knitr::kable()
perm_placement_rank univ_name perm_placements frac_cum_perm_placements
1 University of Oxford 34 0.0270056
2 Princeton University 28 0.0492454
3 Graduate Center of the City University of New York 25 0.0691025
4 Columbia University 25 0.0889595
5 University of Notre Dame 23 0.1072280
6 Rutgers University 22 0.1247021
7 Stanford University 20 0.1405878
8 Villanova University 20 0.1564734
9 Boston College 19 0.1715647
10 University of Toronto 19 0.1866561
11 University of Chicago 18 0.2009531
12 Emory University 18 0.2152502
13 Yale University 18 0.2295473
14 Katholieke Universiteit Leuven 18 0.2438443
15 University of North Carolina at Chapel Hill 17 0.2573471
16 Duquesne University 16 0.2700556
17 University of Michigan 16 0.2827641
18 Baylor University 16 0.2954726
19 University of California, Berkeley 15 0.3073868
20 University of Wisconsin-Madison 15 0.3193010
21 Fordham University 14 0.3304210
22 Saint Louis University 14 0.3415409
23 University of Pittsburgh 14 0.3526608
24 DePaul University 14 0.3637808
25 New York University 14 0.3749007
26 Stony Brook University 14 0.3860207
27 Harvard University 14 0.3971406
28 University of California, Riverside 13 0.4074662
29 University of South Florida 13 0.4177919
30 University of Southern California 13 0.4281176
31 Indiana University Bloomington 13 0.4384432
32 University of Virginia 13 0.4487689
33 Australian National University 13 0.4590945
34 University of Edinburgh 13 0.4694202
35 University of Pittsburgh (HPS) 13 0.4797458
36 Pennsylvania State University 12 0.4892772
37 University of Texas at Austin 12 0.4988086
## Placements at PhD-producing programs
## U.S. programs with at least one recorded placement
grad_programs <- univ_df %>% 
    filter(total_placements > 0, country %in% c('U.S.')) %>% 
    pull(univ_id)
# gp2 = individual_df %>% 
#     count(placing_univ_id) %>% 
#     pull(placing_univ_id)

## Tenure-track placements where both placing and hiring programs are
## U.S. PhD-producing programs, with cumulative shares by placing program
individual_df %>% 
    filter(permanent, 
           hiring_univ_id %in% grad_programs, 
           placing_univ_id %in% grad_programs, 
           position_type == 'Tenure-Track') %>%
    count(placing_univ) %>%
    arrange(desc(n)) %>% 
    rename(phd_program_placements = n) %>% 
    mutate(cum_phd_program_placements = cumsum(phd_program_placements), 
           share_phd_program_placements = cum_phd_program_placements / sum(phd_program_placements)) %>% 
    slice(1:20) %>% 
    knitr::kable(digits = 2)
placing_univ phd_program_placements cum_phd_program_placements share_phd_program_placements
Princeton University 12 12 0.07
Rutgers University 11 23 0.13
University of California, Berkeley 10 33 0.18
New York University 8 41 0.23
University of Chicago 7 48 0.27
Yale University 7 55 0.31
Harvard University 6 61 0.34
University of Arizona 6 67 0.37
University of Notre Dame 6 73 0.41
Columbia University 5 78 0.43
Stanford University 5 83 0.46
University of California, Irvine (LPS) 5 88 0.49
University of California, Los Angeles 5 93 0.52
University of Pittsburgh (HPS) 5 98 0.54
Brown University 4 102 0.57
Massachusetts Institute of Technology 4 106 0.59
University of North Carolina at Chapel Hill 4 110 0.61
University of Pittsburgh 4 114 0.63
University of Southern California 4 118 0.66
University of Wisconsin-Madison 4 122 0.68

Build network

## build network ----
## NB Only permanent placements
## Directed multigraph: one edge per permanent placement, running from the
## placing (PhD) program to the hiring program; univ_df supplies the
## vertex attributes.
hiring_network = individual_df %>%
    filter(permanent) %>% 
    ## graph_from_data_frame() uses the first two columns as edge endpoints,
    ## so they must come first; the rest become edge attributes
    select(placing_univ_id, hiring_univ_id, everything()) %>%
    graph_from_data_frame(directed = TRUE, 
                          vertices = univ_df) %>%
    as_tbl_graph() %>%
    ## Clean nodes (universities) that aren't in the network
    mutate(degree = centrality_degree(mode = 'total')) %>%
    filter(degree > 0)

## 1 giant component contains almost all of the schools
components(hiring_network)$csize
##  [1] 737   2   2   4   2   2   1   2   2   2   2   2   1   2

Out- and in-centrality

to_reverse <- function (graph) {
    ## Reverse the direction of every edge in `graph`, preserving all
    ## vertex and edge attributes.  Undirected graphs are returned
    ## unchanged (matching the original contract); directed graphs come
    ## back as a tbl_graph.
    ## Uses the current igraph API (as_data_frame / graph_from_data_frame)
    ## rather than the soft-deprecated dot-named versions, consistent with
    ## graph_from_data_frame() used elsewhere in this script.  Namespaced
    ## because tidygraph/dplyr also export as_data_frame().
    if (!is_directed(graph))
        return(graph)
    e <- igraph::as_data_frame(graph, what = "edges")
    ## Swap the contents of the first two columns ("from" and "to");
    ## any further columns (edge attributes) are untouched
    e[1:2] <- e[2:1]
    vertex_df <- igraph::as_data_frame(graph, what = "vertices")
    if (ncol(vertex_df) > 0) {
        as_tbl_graph(graph_from_data_frame(e, vertices = vertex_df))
    } else {
        as_tbl_graph(graph_from_data_frame(e))
    }
}

set.seed(42)
nzmin <- function(x, na.rm = TRUE) {
    ## Smallest strictly positive value in x (used below to floor
    ## zero centralities before log transforms)
    positive <- x[x > 0]
    min(positive, na.rm = na.rm)
}
damping = .3
## In- and out-centrality.  Out-centrality is computed on the edge-reversed
## graph (via morph(to_reverse)), so a program's score reflects where its
## graduates are hired rather than where its hires come from.
## NOTE(review): damping = .3 is well below the conventional .85 for
## PageRank -- presumably deliberate; confirm before reuse.
hiring_network = hiring_network %>%
    mutate(in_centrality = centrality_eigen(weights = NA, 
                                            directed = TRUE), 
           in_pr = centrality_pagerank(weights = NA, 
                                       directed = TRUE, 
                                       damping = damping, 
                                       personalized = NULL)) %>%
    morph(to_reverse) %>%
    mutate(out_centrality = centrality_eigen(weights = NA,
                                             directed = TRUE), 
           out_pr = centrality_pagerank(weights = NA, 
                                        directed = TRUE, 
                                        damping = damping, 
                                        personalized = NULL)) %>%
    unmorph() %>%
    ## Trim 0s to minimum non-zero values
    ## (keeps downstream log10 transforms finite)
    mutate(out_centrality = ifelse(out_centrality == 0,
                                   nzmin(out_centrality),
                                   out_centrality),
           in_centrality = ifelse(in_centrality == 0,
                                  nzmin(in_centrality),
                                  in_centrality))
## Warning: `as_quosure()` requires an explicit environment as of rlang 0.3.0.
## Please supply `env`.
## This warning is displayed once per session.
## Check relationship btwn unweighted multiedges and weighted edges
## Strong correlation, esp by approx rank and high/low prestige
## But some movement, both numerically and ordinally
# E(hiring_network)$weight = count_multiple(hiring_network)
# hiring_network_simp = simplify(hiring_network)
# 
# set.seed(42)
# V(hiring_network_simp)$out_centrality = hiring_network_simp %>%
#     graph.reverse() %>%
#     eigen_centrality(directed = TRUE) %>%
#     .$vector
# 
# tibble(name = V(hiring_network)$univ_name,
#        multi = V(hiring_network)$out_centrality,
#        simp = V(hiring_network_simp)$out_centrality) %>%
#     ggplot(aes(log10(simp), log10(multi))) + 
#     geom_point() +
#     stat_function(fun = function (x) x)

Exploring centrality scores

## exploring centrality scores ----

PageRank centrality is almost entirely determined by degree, so we use eigenvector centrality instead.

## Out-PageRank (log10) against total degree; `text` is consumed by
## plotly::ggplotly() tooltips, hence the "unknown aesthetics" warning
ggplot(hiring_network, aes(x = degree, y = log10(out_pr))) +
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text

There are two clear groups of centrality scores, with high scores (in the range of ~10^-12 to 1) and low scores (10^-15 and smaller).

##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(as_tibble(hiring_network), aes(out_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10', 
                       name = 'Out centrality') +
    theme_minimal()

ggsave(str_c(plots_folder, '02_out_centrality.png'), 
       height = 3, width = 6)

ggplot(as_tibble(hiring_network), aes(in_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')

ggplot(as_tibble(hiring_network), 
       aes(out_centrality, in_centrality, 
           color = cluster_label, 
           text = univ_name)) +
    geom_jitter() + 
    scale_x_log10() + scale_y_log10()

plotly::ggplotly()
# ## Check stability of centrality calculation
# iterated_centrality = 1:500 %>%
#     map(~ {hiring_network %>%
#             graph.reverse() %>%
#             eigen_centrality(directed = TRUE, weights = NA) %>%
#             .$vector}) %>%
#     transpose() %>%
#     map(unlist) %>%
#     map(~ tibble(out_centrality = .)) %>%
#     bind_rows(.id = 'univ_id') %>%
#     group_by(univ_id) %>%
#     summarize(min = min(out_centrality),
#               max = max(out_centrality),
#               median = median(out_centrality))
# 
# ## Lots of variation among low-prestige
# ggplot(iterated_centrality, 
#        aes(reorder(univ_id, median), 
#            median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()
# 
# ## But extremely stable results among high-prestige
# iterated_centrality %>%
#     filter(min > 10^-12) %>%
#     ggplot(aes(reorder(univ_id, median), 
#                median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()

We completely rewire the network, preserving out-degree (number of new PhDs) and in-degree (number of permanent hires), but otherwise randomly matching job-seekers to positions. We then recalculate out-centrality. The plots below show the out-centrality distributions for each random rewiring, with the actual distribution in red. The distributions are always bimodal, indicating that the observed split into high- and low-centrality programs is due in part to the way PhD production and hiring are distributed across institutions. However, the high-centrality subset is never as small as in the actual hiring network. This indicates that bimodality is not due only to the degree distributions. Some other factor is exacerbating the structural inequality in the hiring network.

set.seed(78910)
## Degree-preserving null model: 100 random networks with the same
## out-degree (PhDs produced) and in-degree (hires made) sequences,
## compared against the observed out-centrality distribution (red).
map(1:100, 
    ~ sample_degseq(degree(hiring_network, mode = 'out'), 
                    degree(hiring_network, mode = 'in'))
) %>%
    map(to_reverse) %>%
    ## NOTE(review): weights = NULL here, but weights = NA everywhere else
    ## in this script -- confirm the null-model centralities are meant to
    ## differ in how they treat edge weights
    map(eigen_centrality, directed = TRUE, weights = NULL) %>%
    map(~ .$vector) %>%
    map(log10) %>%
    map(density) %>%
    ## One density curve per rewired network
    map(~ tibble(x = .$x, y = .$y)) %>%
    bind_rows(.id = 'iteration') %>%
    ggplot(aes(x, y)) + 
    geom_line(aes(group = iteration), alpha = .5) + 
    stat_density(data = as_tibble(hiring_network), geom = 'line',
                 aes(log10(out_centrality), ..density..),
                 color = 'red', size = 1)

Finding: High output is only modestly correlated w/ centrality. Programs like Leuven, New School, and Boston College produce lots of PhDs, but they aren’t placed into the high-centrality departments.

ggplot(as_tibble(hiring_network), 
       aes(total_placements, log10(out_centrality))) +
    geom_point(aes(text = univ_name)) +
    geom_smooth()
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Warning: Removed 581 rows containing missing values (geom_point).

plotly::ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
# univ_df %>%
#     mutate(out_centrality_log = log10(out_centrality)) %>%
#     filter(!is.na(out_centrality_log) &
#                out_centrality > 0 &
#                !is.na(total_placements)) %>%
#     select(total_placements, out_centrality_log) %>%
#     cor()
## Correlation between total output and log10 out-centrality
## (complete cases only)
as_tibble(hiring_network) %>%
    select(total_placements, out_centrality) %>%
    filter(complete.cases(.)) %>%
    mutate(out_centrality = log10(out_centrality)) %>%
    cor()
##                  total_placements out_centrality
## total_placements         1.000000       0.498529
## out_centrality           0.498529       1.000000
ggplot(hiring_network, 
       aes(perm_placement_rank, log10(out_centrality))) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).

## Warning: Removed 581 rows containing missing values (geom_point).

ggplot(hiring_network, 
       aes(log10(out_centrality), perm_placement_rate)) +
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Warning: Removed 581 rows containing missing values (geom_point).

plotly::ggplotly()
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Correlation matrix: output volume, permanent placement rate, and
## log10 out-centrality (complete cases only)
hiring_network %>%
    as_tibble() %>%
    select(total_placements, perm_placement_rate, 
           out_centrality) %>%
    filter(complete.cases(.)) %>%
    mutate(out_centrality = log10(out_centrality)) %>%
    cor()
##                     total_placements perm_placement_rate out_centrality
## total_placements          1.00000000         -0.01568778       0.498529
## perm_placement_rate      -0.01568778          1.00000000       0.189328
## out_centrality            0.49852895          0.18932795       1.000000
## Add centrality scores to univ_df (node attributes keyed by univ_id)
node_scores <- hiring_network %>%
    as_tibble() %>%
    rename(univ_id = name)
univ_df <- left_join(univ_df, node_scores)
## Joining, by = c("univ_id", "univ_name", "cluster_lvl1", "cluster_lvl2", "cluster_lvl3", "cluster_lvl4", "cluster_label", "city", "state", "country", "total_placements", "perm_placements", "perm_placement_rate", "frac_perm_placements", "cum_perm_placements", "frac_cum_perm_placements", "perm_placement_rank", "aos_diversity", "m_count", "w_count", "o_count", "gender_na_count", "frac_w")
## Movement down the prestige hierarchy
## For each permanent placement, pair the centrality of the placing (PhD)
## program with that of the hiring program, then draw one line per person
individual_df %>%
    filter(permanent) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('hiring_univ_id' = 'univ_id')) %>%
    # filter(out_centrality.y > 10^-12) %>%
    ## .x/.y suffixes come from the two joins: placing vs hiring program
    select(person_id, aos_category,
           placing = out_centrality.x,
           hiring = out_centrality.y) %>%
    ## Long format: one row per person x (placing/hiring)
    gather(variable, value, placing, hiring) %>%
    ## fct_rev() puts 'placing' on the left of the plot
    mutate(variable = fct_rev(variable)) %>%
    ggplot(aes(variable, log10(value))) +
    geom_point() +
    geom_line(aes(group = person_id), 
              alpha = .25) +
    xlab('university') +
    ylab('centrality (log10)') +
    theme_minimal()

ggsave(str_c(plots_folder, '02_prestige_movement.png'), 
       width = 3, height = 4)

## Diagonal line indicates where hiring program has the same centrality as the placing program.  
## Most placements are below this line, indicating that the centrality measure captures the idea that people typically are hired by programs with lower status
## The few placements above the line indicate that, even when individuals are hired by programs with higher status, the increase is relatively minor:  no more than 1 point on the log10 scale
individual_df %>% 
    filter(permanent) %>%
    left_join(select(univ_df, univ_id, out_centrality), 
              by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('hiring_univ_id' = 'univ_id')) %>%
    ## Restrict to hires *into* the high-centrality group (10^-12 is the
    ## same high-prestige threshold used elsewhere in this script)
    filter(out_centrality.y > 10^-12) %>%
    select(person_id, 
           placing = out_centrality.x, 
           hiring = out_centrality.y) %>%
    ggplot(aes(log10(placing), log10(hiring))) + 
    geom_jitter() + 
    ## y = x reference line
    stat_function(fun = identity)

Community detection

## community detection ----
## Extract giant component
## Compute the component decomposition once (the original pipeline called
## components() twice) and keep the vertices of the largest component.
## which.max() takes the first largest component, as the original
## which(. == max(.)) did implicitly when the maximum was unique.
comps <- components(hiring_network)
hiring_network_gc <- hiring_network %>%
    induced_subgraph(which(comps$membership == which.max(comps$csize))) %>%
    as_tbl_graph()
## Walk lengths to evaluate (the original rep(2:100, 1) was a no-op)
walk_len <- 2:100
## ~24 sec
tic()
comm_stats <- walk_len %>%
    map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
    map(~ list(sizes = sizes(.x), length = length(.x))) %>%
    ## H: Shannon entropy (bits) of the community-size distribution
    map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
                     n_comms = .x$length)) %>%
    ## delta_H: gap below the maximum possible entropy log2(n_comms);
    ## 0 iff all communities are the same size
    mutate(walk_len = walk_len,
           delta_H = log2(n_comms) - H)
toc()
## 22.242 sec elapsed
ggplot(comm_stats, aes(walk_len, H)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(comm_stats, aes(walk_len, n_comms)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(comm_stats, aes(n_comms, delta_H)) +
    geom_text(aes(label = walk_len))

Select the walk length that minimizes both delta_H (flatter community distribution) and n_comms (fewer communities) using regression residuals

## Walk length whose delta_H falls furthest below the linear trend in
## n_comms (most negative regression residual)
walk_length <- lm(delta_H ~ n_comms, data = comm_stats) %>%
    augment(comm_stats) %>%
    arrange(.resid) %>%
    slice(1) %>%
    pull(walk_len)

walk_length
## Final community detection at the selected walk length
communities <- cluster_walktrap(hiring_network_gc, steps = walk_length)
V(hiring_network_gc)$community <- membership(communities)

## Summary of community sizes
hiring_network_gc %>% 
    as_tibble() %>% 
    count(community) %>% 
    with(summary(n))

## Attach community memberships (as character) to univ_df
community_df <- hiring_network_gc %>%
    as_tibble() %>%
    select(univ_id = name, community) %>%
    mutate(community = as.character(community))
univ_df <- left_join(univ_df, community_df)
## Joining, by = "univ_id"
univ_df %>% 
    filter(!is.na(community)) %>% 
    count(community) %>% 
    ggplot(aes(n)) +
    geom_bar(aes(text = n)) +
    scale_x_continuous(name = 'Community size (# programs)')
## Warning: Ignoring unknown aesthetics: text

plotly::ggplotly()
cluster_vars = univ_df %>% 
    select(matches('cluster')) %>% 
    names()

## NB the original wrote !is.na('cluster_lvl4') -- a string literal, which
## is never NA -- so the second condition was a no-op (the printed result
## below still shows rows with cluster_lvl4 = NA).  Filter on the column.
univ_df %>% 
    filter(!is.na(community), !is.na(cluster_lvl4))
## # A tibble: 737 x 29
##    univ_id univ_name cluster_lvl1 cluster_lvl2 cluster_lvl3 cluster_lvl4
##    <chr>   <chr>     <chr>        <chr>        <chr>        <chr>       
##  1 3002     North D… <NA>         <NA>         <NA>         <NA>        
##  2 117     Air Forc… <NA>         <NA>         <NA>         <NA>        
##  3 1228    Al-Asmar… <NA>         <NA>         <NA>         <NA>        
##  4 353     Albany S… <NA>         <NA>         <NA>         <NA>        
##  5 707     Alleghen… <NA>         <NA>         <NA>         <NA>        
##  6 385     American… <NA>         <NA>         <NA>         <NA>        
##  7 1169    American… <NA>         <NA>         <NA>         <NA>        
##  8 1134    American… <NA>         <NA>         <NA>         <NA>        
##  9 701     Amherst … <NA>         <NA>         <NA>         <NA>        
## 10 3394    Anderson… <NA>         <NA>         <NA>         <NA>        
## # … with 727 more rows, and 23 more variables: cluster_label <chr>,
## #   city <chr>, state <chr>, country <chr>, total_placements <int>,
## #   perm_placements <int>, perm_placement_rate <dbl>,
## #   frac_perm_placements <dbl>, cum_perm_placements <int>,
## #   frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## #   aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## #   gender_na_count <dbl>, frac_w <dbl>, degree <dbl>,
## #   in_centrality <dbl>, in_pr <dbl>, out_centrality <dbl>, out_pr <dbl>,
## #   community <chr>

Finding: There is no correlation between semantic clusters and topological communities.

## Chi-squared test of independence between topological communities and
## semantic clusters (simulated p-value because many cells are sparse)
univ_df %>%
    filter(!is.na(community), !is.na(cluster_label)) %>%
    with(table(community, cluster_label)) %>%
    chisq.test(simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  .
## X-squared = 426.35, df = NA, p-value = 0.08846
univ_df %>%
    filter(!is.na(community), !is.na(cluster_label)) %>%
    count(community, cluster_label) %>%
    rename(cluster_n = n) %>%
    group_by(community) %>%
    mutate(community_tot = sum(cluster_n), 
           cluster_frac = cluster_n / community_tot, 
           ## NOTE(review): H = sum(p * log2(p)) is the *negative* of the
           ## Shannon entropy (missing minus sign).  H is unused in the
           ## plot below, but fix the sign before using it elsewhere.
           H = sum(cluster_frac * log2(cluster_frac))) %>%
    ungroup() %>%
    ## Stacked bars: semantic-cluster composition of each community,
    ## ordered by community size
    ggplot(aes(fct_reorder(community, community_tot, .desc = FALSE),
               cluster_n, fill = cluster_label)) + 
    geom_col() + 
    coord_flip() +
    xlab('Topological communities') +
    ylab('# of schools in community') +
    scale_fill_brewer(palette = 'Set1', 
                      name = 'Semantic\nclusters')

High-prestige universities

## high-prestige universities
## Start w/ Oxford, and work upstream
## Only need to go 11 or 12 steps to get closure
## Size of Oxford's (univ_id '512') in-neighborhood as the ego-graph order
## grows; the curve flattens once the upstream closure is reached
1:25 %>%
    map(~ make_ego_graph(hiring_network, order = .x, 
                         nodes = '512', mode = 'in')) %>%
    flatten() %>%
    map_int(~ length(V(.x))) %>%
    ## Braces stop the pipe from also inserting the sizes as tibble()'s
    ## first argument (the original created a stray list column named `.`)
    {tibble(order = seq_along(.), size = .)} %>%
    ggplot(aes(order, size)) + geom_point()

prestigious = make_ego_graph(hiring_network, order = 12, 
                             nodes = '512', 
                             mode = 'in') %>%
    .[[1]] %>%
    as_tbl_graph()

## How large is the high-prestige community?  
## 56 programs; 7% of all programs in the network; 
## 28% of programs with at least 1 placement in the dataset
length(V(prestigious))
## [1] 56
length(V(prestigious)) / length(V(hiring_network))
## [1] 0.0733945
length(V(prestigious)) / sum(univ_df$total_placements > 0, na.rm = TRUE)
## [1] 0.2758621
## What fraction of hires are within high-prestige?  
## 13% of all permanent hires; 7% of all hires
length(E(prestigious)) / length(E(hiring_network))
## [1] 0.1278793
length(E(prestigious)) / nrow(individual_df)
## [1] 0.06552707
# layout_prestigious = layout_with_focus(prestigious, 
#                                        which(V(prestigious)$univ_name == 'University of Oxford')) %>% 
#     `colnames<-`(c('x', 'y')) %>% 
#     as_tibble()
layout_prestigious = create_layout(prestigious, 'focus', 
                                   focus = which(V(prestigious)$univ_name == 'University of Oxford'))

# png(file = '02_prestigious_net.png', 
#     width = 11, height = 11, units = 'in', res = 400)
# set.seed(24)
ggraph(layout_prestigious) + 
    geom_node_label(aes(label = univ_name, 
                        size = log10(out_centrality), 
                        fill = log10(out_centrality)),
                    color = 'white') + 
    geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')), 
                  alpha = .25,
                  strength = 5) +
    scale_size_continuous(range = c(.5, 3), guide = FALSE) +
    scale_fill_viridis(name = 'out centrality (log10)') +
    coord_cartesian(xlim = c(-3.5, 5.5), clip = 'on') +
    theme_graph() +
    theme(legend.position = 'bottom', 
          plot.margin = margin(0, unit = 'cm'))

ggsave(str_c(plots_folder, '02_prestigious_net.png'), 
       width = 6, height = 4, dpi = 600, scale = 2)
# dev.off()

## High-prestige = high centrality group
## Label univ_df rows by membership in Oxford's upstream closure;
## flag network vertices by the 10^-12 out-centrality threshold
univ_df <- univ_df %>%
    mutate(prestige = if_else(univ_id %in% V(prestigious)$name, 
                              'high-prestige', 
                              'low-prestige'))
V(hiring_network)$prestigious <- V(hiring_network)$out_centrality > 1e-12
V(hiring_network_gc)$prestigious <- V(hiring_network_gc)$out_centrality > 1e-12

ggplot(univ_df, aes(prestige, log10(out_centrality))) + 
    geom_jitter()
## Warning: Removed 308 rows containing missing values (geom_point).

## Prestige status and clusters
## High-prestige are spread throughout, but #5 is mostly low-prestige
univ_df %>%
    filter(!is.na(cluster_label)) %>%
    ggplot(aes(cluster_label, color = prestige)) + 
    geom_point(stat = 'count') +
    geom_line(aes(group = prestige), stat = 'count')

## High-prestige are mostly in the largest community
univ_df %>%
    filter(!is.na(community)) %>%
    ggplot(aes(as.integer(community), fill = prestige)) + 
    geom_bar()

## What fraction of high-prestige graduates end up in high-prestige programs? 
## 24% of those w/ permanent placements
individual_df %>%
    filter(permanent) %>%
    left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
    ## prestige.x = placing program, prestige.y = hiring program
    select(placing = prestige.x, hiring = prestige.y) %>%
    count(placing, hiring) %>%
    ## share = fraction within each placing group (result stays grouped)
    group_by(placing) %>%
    mutate(share = n / sum(n))
## # A tibble: 3 x 4
## # Groups:   placing [2]
##   placing       hiring            n share
##   <chr>         <chr>         <int> <dbl>
## 1 high-prestige high-prestige   161 0.239
## 2 high-prestige low-prestige    514 0.761
## 3 low-prestige  low-prestige    584 1

Finding: Median permanent placement rate for high-prestige programs is 22 points higher than for low-prestige programs. However, variation is also wide within each group.
This is also not yet controlling for graduation year, area, or demographics.

ggplot(univ_df, aes(prestige, perm_placement_rate, 
                    label = univ_name)) + 
    geom_sina(aes(size = total_placements), 
              alpha = .5) +
    geom_violin(color = 'red', draw_quantiles = .5, 
                fill = 'transparent') +
    # geom_jitter(aes(size = total_placements)) +
    scale_y_continuous(labels = scales::percent_format()) +
    ylab('permanent placement rate') +
    scale_size(name = 'total placements') +
    theme_minimal()
## Warning: Removed 868 rows containing non-finite values (stat_sina).
## Warning: Removed 868 rows containing non-finite values (stat_ydensity).

ggsave(str_c(plots_folder, '02_prestige_placement.png'), 
       width = 6, height = 4)
## Warning: Removed 868 rows containing non-finite values (stat_sina).

## Warning: Removed 868 rows containing non-finite values (stat_ydensity).
plotly::ggplotly()
## Warning: Removed 868 rows containing non-finite values (stat_sina).

## Warning: Removed 868 rows containing non-finite values (stat_ydensity).
## Median/max/min permanent placement rate by prestige group.
## funs() is soft-deprecated since dplyr 0.8 (see the warning this chunk
## previously emitted); a named list of functions is the replacement and
## yields the same column names.
univ_df %>%
    group_by(prestige) %>%
    summarize_at(vars(perm_placement_rate), 
                 list(median = median, max = max, min = min), na.rm = TRUE)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 2 x 4
##   prestige      median   max   min
##   <chr>          <dbl> <dbl> <dbl>
## 1 high-prestige  0.638     1 0.235
## 2 low-prestige   0.417     1 0

Stability of prestige status

By randomly rewiring the network, we can test the stability of the prestige categories to data errors and the short time frame of our data. In each of 500 permutations, we randomly rewire 10% of the edges in the permanent hiring network, then calculate out-centralities on the rewired network. Using a threshold of 10^-12 for high-prestige status, we count the fraction of rewired networks in which each program is high-prestige.

## ~6 sec
tic()
set.seed(13579)
## Stability of prestige status under 500 degree-preserving rewirings.
## keeping_degseq() swaps the endpoints of two edges per iteration, so
## niter = 0.05 * |E| iterations touches ~10% of the edges.
permutations = 1:500 %>%
    ## Rewire 10% of edges
    map(~ {hiring_network %>%
            rewire(keeping_degseq(loops = TRUE, 
                                  niter = floor(.05 * length(E(.)))))}) %>%
    ## Calculate out-centralities
    map(~ {.x %>%
            to_reverse() %>%
            eigen_centrality(directed = TRUE, weights = NA) %>%
            .$vector}) %>%
    ## transpose(): list-of-runs -> list-of-programs
    transpose() %>%
    map(unlist) %>%
    ## Fraction where program is high-prestige
    map(~ sum(. > 10^-12) / length(.)) %>%
    map(~ tibble(frac_high_prestige = .)) %>%
    bind_rows(.id = 'univ_id')
toc()
## 7.458 sec elapsed
## frac_high_prestige indicates the fraction of permutations in which the program was high-prestige
ggplot(permutations, aes(frac_high_prestige)) + 
    geom_density() + 
    geom_rug()

## Join the stability scores back onto univ_df (keyed by univ_id)
univ_df = left_join(univ_df, permutations)
## Joining, by = "univ_id"

Finding: Counterfactual high-prestige status is strongly correlated with centrality ranking.

ggplot(univ_df, aes(log10(out_centrality), frac_high_prestige)) + 
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 308 rows containing non-finite values (stat_smooth).
## Warning: Removed 308 rows containing missing values (geom_point).

plotly::ggplotly()
## Warning: Removed 308 rows containing non-finite values (stat_smooth).
ggplot(univ_df, aes(perm_placements, frac_high_prestige, color = prestige)) + 
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 889 rows containing non-finite values (stat_smooth).
## Warning: Removed 889 rows containing missing values (geom_point).

plotly::ggplotly()
## Warning: Removed 889 rows containing non-finite values (stat_smooth).

Finding: For low-prestige programs, counterfactual prestige seems to depend on the extent of the program’s downstream hiring network. Compare Boston College (19 permanent placements; 40% high prestige) to Leuven (18 permanent placements; 20% high prestige).

## Downstream (mode = 'out') ego networks around Boston College ('529') and
## Leuven ('38'): every program reachable within 10 placement steps.
bc_leuven_net = make_ego_graph(hiring_network, order = 10, 
                           nodes = c('529', '38'),
                           mode = 'out') %>% 
    map(as_tbl_graph) %>%
    reduce(bind_graphs) %>% 
    ## replace_na() is the idiomatic NA -> 0 fill
    ## (replaces the hand-rolled ifelse(is.na(.), 0, .))
    mutate(perm_placements = replace_na(perm_placements, 0))

bc_leuven_layout = create_layout(bc_leuven_net, 'stress')

ggraph(bc_leuven_layout) + 
    geom_node_label(aes(label = univ_name, 
                        size = perm_placements, 
                        fill = perm_placements), 
                    color = 'white') + 
    geom_edge_fan(arrow = arrow(angle = 45, 
                                length = unit(.1, 'inches'), 
                                type = 'closed'), 
                  alpha = .3) +
    scale_size_continuous(range = c(1, 5),
                          name = 'placements',
                          ## 'none' instead of the deprecated guide = FALSE
                          guide = 'none') +
    scale_fill_viridis(na.value = 'black', name = 'permanent\nplacements') +
    theme_graph()

ggsave(str_c(plots_folder, '02_bc_leuven.png'), 
       height = 6, width = 6, dpi = 600, scale = 1.25)

Plotting

Coarser community structure

# large_comms = which(sizes(communities) > 20)
# V(hiring_network_gc)$community_coarse = ifelse(
#     V(hiring_network_gc)$community %in% large_comms, 
#     V(hiring_network_gc)$community, 
#     NA)

## Big giant hairy ball
## ~13 sec
# tic()
# layout_net = hiring_network %>%
#     layout_with_stress() %>%
#     `colnames<-`(c('x', 'y')) %>%
#     as_tibble()
# toc()

## Focus on Oxford
## GC only; ~7 sec
# tic()
# layout_net = create_layout(hiring_network_gc, 'focus', focus = 512)
# toc()

## Stress majorization
## ~2 sec
## Stress-majorization layout of the giant component (~2 sec; fastest of the
## three layout strategies tried above).
tic()
layout_net = create_layout(hiring_network_gc, 'stress')
toc()
## 1.915 sec elapsed
## Full "hairball" plot: all edges, nodes colored by semantic cluster and
## faded by out-centrality.
layout_net %>%
    ## NB neither of these preserve the layout_tbl_graph class, which breaks things when we get to ggraph()
    # filter(total_placements > 0) %>% 
    # induced_subgraph(which(degree(., mode = 'out') > 0)) %>% 
    ggraph() +
    geom_edge_parallel(arrow = arrow(length = unit(.02, 'npc'), 
                                angle = 15,
                                type = 'closed'),
                  # spread = 5, 
                  alpha = .05) +
    geom_node_point(aes(#size = log10(out_centrality),
                        alpha = log10(out_centrality),
                        # color = as.factor(community)
                        color = cluster_label
                        # color = log10(out_centrality)
    ), size = 2) +
    # scale_color_brewer(palette = 'Set1', na.value = 'black') +
    ## NA cluster labels are drawn in black so unclustered programs stay visible
    scale_color_viridis_d(na.value = 'black', name = 'Semantic cluster') +
    # scale_size_discrete(range = c(2, 6)) +
    scale_alpha_continuous(range = c(.2, 1), 
                           name = 'Out centrality (log10)') +
    # scale_size_continuous(range = c(1, 3)) +
    theme_graph()

ggsave(str_c(plots_folder, '02_hairball.png'), 
       width = 12, height = 12)

# hiring_network_gc %>%
#     induced_subgraph(!is.na(V(.)$cluster_lvl1)) %>%
#     ggraph() +
#     geom_node_label(aes(size = prestigious, 
#                         label = univ_name,
#                         # color = as.factor(community))) +
#                         fill = cluster_lvl4), color = 'black') +
#     geom_edge_fan(aes(linetype = aos_category, color = aos_category), 
#                   width = 1,
#                   arrow = arrow(length = unit(.01, 'npc')), 
#                   spread = 5) +
#     scale_edge_color_brewer(palette = 'Set1') +
#     scale_fill_brewer(palette = 'Set3', na.value = 'grey75') +
#     scale_size_discrete(range = c(3, 5)) +
#     theme_graph()

Chord diagram

# hiring_network_gc %>%
#     # induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
#     ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
#     geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
#     geom_node_point(aes(size = prestigious, 
#                         # color = as.factor(community))) +
#                         color = cluster_lvl4)) +
#     scale_color_brewer(palette = 'Set1', guide = FALSE) +
#     theme_graph()

## Separate networks for each cluster
# cluster_ids = hiring_network %>% 
#     as_tibble() %>% 
#     split(., .$cluster_lvl4) %>% 
#     map(pull, name)
# 
# cluster_ids %>% 
#     map(~induced_subgraph(hiring_network, .))

# hiring_network %>% 
#     # filter(!is.na(cluster_lvl4)) %>% 
#     ggraph(layout = 'manual', 
#        node.positions = layout_net) +
#     geom_edge_fan(alpha = .1) +
#     geom_node_point(aes(color = cluster_label), show.legend = TRUE) +
#     # facet_nodes(vars(cluster_lvl4)) +
#     scale_color_viridis_d(na.value = 'grey90') +
#     theme_graph()

Save university-level data with network statistics

write_rds(univ_df, str_c(data_folder, '02_univ_net_stats.rds'))


sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tictoc_1.0      ggforce_0.3.1   broom_0.5.2     ggraph_2.0.0   
##  [5] tidygraph_1.1.2 igraph_1.2.4    forcats_0.4.0   stringr_1.4.0  
##  [9] dplyr_0.8.2     purrr_0.3.2     readr_1.3.1     tidyr_0.8.3    
## [13] tibble_2.1.3    ggplot2_3.2.0   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1      httr_1.4.0         jsonlite_1.6      
##  [4] viridisLite_0.3.0  modelr_0.1.4       shiny_1.2.0       
##  [7] assertthat_0.2.0   highr_0.7          cellranger_1.1.0  
## [10] yaml_2.2.0         ggrepel_0.8.0      pillar_1.3.1      
## [13] backports_1.1.3    lattice_0.20-38    glue_1.3.1        
## [16] digest_0.6.18      RColorBrewer_1.1-2 promises_1.0.1    
## [19] polyclip_1.10-0    rvest_0.3.4        colorspace_1.4-0  
## [22] htmltools_0.3.6    httpuv_1.4.5.1     pkgconfig_2.0.2   
## [25] haven_2.1.0        xtable_1.8-3       scales_1.0.0      
## [28] tweenr_1.0.1       later_0.8.0        generics_0.0.2    
## [31] farver_1.1.0       ellipsis_0.1.0     withr_2.1.2       
## [34] lazyeval_0.2.1     cli_1.1.0          magrittr_1.5      
## [37] crayon_1.3.4       readxl_1.3.1       mime_0.6          
## [40] evaluate_0.13      fansi_0.4.0        nlme_3.1-137      
## [43] MASS_7.3-51.1      xml2_1.2.0         tools_3.5.1       
## [46] data.table_1.12.0  hms_0.4.2          plotly_4.8.0      
## [49] munsell_0.5.0      compiler_3.5.1     rlang_0.4.0       
## [52] grid_3.5.1         rstudioapi_0.10    htmlwidgets_1.3   
## [55] crosstalk_1.0.0    labeling_0.3       rmarkdown_1.12    
## [58] gtable_0.2.0       graphlayouts_0.5.0 R6_2.4.0          
## [61] gridExtra_2.3      lubridate_1.7.4    knitr_1.22        
## [64] utf8_1.1.4         stringi_1.4.3      Rcpp_1.0.1        
## [67] tidyselect_0.2.5   xfun_0.5